#0) define libraries
import pandas as pd
import numpy as np
from os import listdir
from functools import reduce

# =============================================================================
#1) Define options
pd.set_option('display.max_columns', None)


# =============================================================================
#2) Read data
#df = pd.read_spss("C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/COVID-19 production 05012020 anonymized.sav")
#df = pd.read_spss("C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/COVID-19 FINAL DATA_CAP.sav")
df = pd.read_stata("C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/P2P-COVID/COVID-19 FINAL DATA_NO CAP.dta")
vals = df.iloc[0]


# =============================================================================
#3) Clean P2P data
#3.1) Rename/remove variables
paper1_vars = {
    "SU_ID":"id",
    "FIPS":"state_county_name",
    "econ1":"u_employed", 
    "econ2":"dv_inc_reduced", 
    "econ3":"dv_hours_reduced",
    "econ4":"dv_work_home",
    "econ5":"dv_work_office",
    "econ6":"dv_change_job",
    "econ7":"dv_second_job",
    "econ8":"dv_child_affect_work",
    "econ9":"dv_fired",
    "econ10":"dv_unable_find_work",
    "covid2f":"dv_worry_hosp_overwhelm",
    "covid2i":"dv_worry_home_insecure",
    "covid2j":"dv_worry_food_insecure",
    "covid2k":"dv_worry_finances",
    "network2d":"dv_likey_to_receive_services",
    "network2e":"dv_likey_to_receive_loans",
    "network2f":"dv_likey_to_receive_food",
    "health8":"dv_stopped_medications",
    "health9_8":"dv_uninsured",
    "DOB_DATE": "cv_age",
    "GENDER_ID":"cv_sex_female",
    "EDUCATION":"cv_education",
    "VETERAN":"cv_veteran",
    "CURRENT_LIVING":"cv_own_home",
    "MARITAL_STATUS":"cv_marital",
    "RACE_AMIND":"iv_race_other1",
    "RACE_ASIAN":"iv_race_other2",
    "RACE_BLACK_AFAM":"iv_race_black",
    "RACE_NATHAW_PACIS":"iv_race_other3",
    "RACE_OTHER":"iv_race_other4",
    "RACE_WHITE":"iv_race_white",
    "ETHNICITY":"iv_race_latino",
    "COUNTRY_ORIGIN":"iv_immigrant",
    "PARENTS_BORN":"iv_immigrant_gen2",
    "NO_GRANDPARENTS_CO":"iv_immigrant_gen3",
    "covid3a":"test_bad_symptoms",
    "covid3b":"test_near_affected",
    "covid3c":"test_afford_treatment",
    "covid3d":"test_hospitals_can_treat",
    "covid3e":"test_disable_working",
    "covid3f":"test_risk_to_others",
    "covid2a":"worried_likely_to_catch",
    "covid2b":"worried_catch",
    "covid2c":"worried_ill",
    "covid2d":"worried_friends_catch",
    "covid2e":"worried_friends_ill",
    "covid4":"covid_symptoms",
    "covid5":"covid_tested",
    "network1a":"covid_friend",
    "network1b_1":"covid_friend_count",
    "health3":"physical_health"
}

supplement_vars = {
    "network1a":"Know_Anyone\nwith_COVID",
    "network1b_1":"Number_Known\nwith_COVID",
    "network1c":"Contact_with\nContagious_Friend",
    "network1d":"Close_Friend_or\nFamily_with_COVID",
    "network2a":"Discuss_Personal\nProblems",
    "network2b":"Discuss_Health",
    "network2c":"Get_Health\nAdvice",
    "network2d":"Get_Help\nwith_Services",
    "network2e":"Get_Help\nwith_Finances",
    "network2f":"Get_Help\nwith_Food",
    "network2g":"Spend_Time_with\nClose_Friends",
    "network2h":"Have_Problems\nwith_Other_People",
    "network2i":"Depended_by\nOther_People",
    "network2j":"Provide_Help_to\nOther_People",
    "network2k":"Reconnect_with\nClose_Friends",
    "network2l":"Use_Technology_\nto_Socialize",
}
tdf = df.rename(columns=supplement_vars)[[*supplement_vars.values()]]
df = df.rename(columns=paper1_vars)[[*paper1_vars.values()]]


#3.2 fix tdf and concatenate into df
for i in range(0, len(tdf.columns)):
    tdf.iloc[:, i] = np.where(tdf.iloc[:, i].isin(["NOT ASKED DUE TO SKIP LOGIC"]), 0, tdf.iloc[:, i])
    tdf.iloc[:, i] = np.where(tdf.iloc[:, i].isin(["DON'T KNOW", "REFUSED"]), np.NaN, tdf.iloc[:, i])
    tdf.iloc[:, i] = np.where(tdf.iloc[:, i] == "Yes", 1, tdf.iloc[:, i])
    tdf.iloc[:, i] = np.where(tdf.iloc[:, i] == "No", 0, tdf.iloc[:, i])
df = pd.concat([df, tdf], axis = 1)

#temp) Check values in df
# for i in range(0, len(tdf.columns)):
#     print(tdf.iloc[:, i].value_counts())


#3.3 create new vars
df = df.assign(
    id = df[['id']].astype(int).astype(str),
    u_employed = np.where(df['u_employed'] == 'Yes', 1, 0),
    dv_inc_reduced = np.where(df['dv_inc_reduced'] == 'Yes', 1, 0),
    dv_hours_reduced = np.where(df['dv_hours_reduced'] == 'Yes', 1, 0),
    dv_work_home = np.where(df['dv_work_home'] == 'Yes', 1, 0),
    dv_work_office = np.where(df['dv_work_office'] == 'Yes', 1, 0),
    dv_change_job = np.where(df['dv_change_job'] == 'Yes', 1, 0),
    dv_second_job = np.where(df['dv_second_job'] == 'Yes', 1, 0),
    dv_child_affect_work = np.where(df['dv_child_affect_work']=='Yes', 1, 0),
    dv_fired = np.where(df['dv_fired']=='Yes', 1, 0),
    dv_unable_find_work = np.where(df['dv_unable_find_work']=='Yes', 1, 0),
    dv_worry_hosp_overwhelm = np.where(df[['dv_worry_hosp_overwhelm']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['dv_worry_hosp_overwhelm']]),
    dv_worry_home_insecure = np.where(df[['dv_worry_home_insecure']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['dv_worry_home_insecure']]),
    dv_worry_food_insecure = np.where(df[['dv_worry_food_insecure']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['dv_worry_food_insecure']]),
    dv_worry_finances = np.where(df[['dv_worry_finances']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['dv_worry_finances']]),
    dv_likey_to_receive_services = np.where(df[['dv_likey_to_receive_services']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['dv_likey_to_receive_services']]),
    dv_likey_to_receive_loans = np.where(df[['dv_likey_to_receive_loans']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['dv_likey_to_receive_loans']]),
    dv_likey_to_receive_food = np.where(df[['dv_likey_to_receive_food']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['dv_likey_to_receive_food']]),
    dv_stopped_medications = np.where(df['dv_stopped_medications']=='Yes', 1, 0),
    dv_uninsured = np.where(df['dv_uninsured']=='SELECTED', 1, 0),
    cv_age = 2020 - pd.DatetimeIndex(df['cv_age']).year,
    cv_sex_female = np.where(df['cv_sex_female'] =='FEMALE', 1, 0),
    cv_educ_bachelors = np.where(df['cv_education']=='College or higher', 1, 0),
    cv_educ_some_college = np.where((df['cv_education']=='Some college (no degree)') | (df['cv_education']=='Technical Certificates/Assoc. Degree'), 1, 0),
    cv_educ_hs = np.where(df['cv_education']=='High school graduate or GED', 1, 0),
    cv_educ_lths = np.where(df['cv_education']=='Less than high school', 1, 0),
    cv_veteran = np.where(df['cv_veteran']=='YES', 1, 0),
    cv_own_home = np.where(df['cv_own_home']=='OWN/BUYING', 1, 0),
    cv_mar_married = np.where(df['cv_marital']=='Married', 1, 0),
    cv_mar_divorced = np.where(df['cv_marital'].isin(['Divorced or annulled', 'Separated']), 1, 0),
    cv_mar_nevermarried = np.where(df['cv_marital'].isin(['Never been married', 'Not married but living with a partner']), 1, 0),
    cv_mar_widowed = np.where(df['cv_marital']=='Widowed', 1, 0),
    iv_race_other = np.where((df['iv_race_other1'] == "SELECTED") | (df['iv_race_other2'] == "SELECTED") | (df['iv_race_other3'] == "SELECTED") | (df['iv_race_other4'] == "SELECTED"), 1, 0),
    iv_race_black = np.where(df['iv_race_black'] == "SELECTED", 1, 0),
    iv_race_white = np.where(df['iv_race_white'] == "SELECTED", 1, 0),
    iv_race_latino = np.where(df['iv_race_latino'] == "YES", 1, 0),
    iv_immigrant = np.where(df['iv_immigrant'] == "YES", 1, 0),
    iv_immigrant_gen2 = np.where(df['iv_immigrant_gen2'] != 'BOTH BORN IN U.S.', 1, 0),
    iv_immigrant_gen3 = np.where(df['iv_immigrant_gen3'] != "NONE", 1, 0),
    test_bad_symptoms = np.where(df[['test_bad_symptoms']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['test_bad_symptoms']]),
    test_near_affected = np.where(df[['test_near_affected']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['test_near_affected']]),
    test_afford_treatment = np.where(df[['test_afford_treatment']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['test_afford_treatment']]),
    test_hospitals_can_treat = np.where(df[['test_hospitals_can_treat']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['test_hospitals_can_treat']]),
    test_disable_working = np.where(df[['test_disable_working']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['test_disable_working']]),
    test_risk_to_others = np.where(df[['test_risk_to_others']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['test_risk_to_others']]),
    worried_friends_catch = np.where(df[['worried_friends_catch']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['worried_friends_catch']]),
    worried_catch = np.where(df[['worried_catch']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['worried_catch']]),
    worried_ill = np.where(df[['worried_ill']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['worried_ill']]),
    worried_likely_to_catch = np.where(df[['worried_likely_to_catch']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['worried_likely_to_catch']]),
    worried_friends_ill = np.where(df[['worried_friends_ill']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, df[['worried_friends_ill']]),
    covid_symptoms = np.where(df[['covid_symptoms']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, np.where(df[['covid_symptoms']] == "Yes", 1, 0)),
    covid_tested = np.where(df[['covid_tested']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, np.where(df[['covid_tested']] == "Yes", 1, 0)),
    covid_friend = np.where(df[['covid_friend']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, np.where(df[['covid_friend']] == "Yes", 1, 0)),
    covid_friend_count = np.where(df[['covid_friend_count']].isin(["NOT ASKED DUE TO SKIP LOGIC"]), np.NaN, df[['covid_friend_count']]).astype('float64')
)
df = df.assign(
    iv_race_black  = np.where((df['iv_race_latino'] == 1) | (df['iv_race_other'] == 1), 0, df['iv_race_black']),
    iv_race_white = np.where((df['iv_race_latino'] == 1) | (df['iv_race_other'] == 1), 0, df['iv_race_white']),
    iv_race_other = np.where((df['iv_race_latino'] == 1), 0, df['iv_race_other'])
)


# =============================================================================
#4) pull in sample weights and fips codes
#4.1) read data
df_acs = pd.read_csv("C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/Other/acs_county_sample_weights.csv")

#4.2) remove states outside indiana
df_acs = df_acs[df_acs['state_county'].astype(str).str.startswith("18")]

#4.3) match county names
df["state_county_name"] = df["state_county_name"].str.lower()
df["county_name"] = df["state_county_name"].str.replace("indiana, ", " ")
df["county_name"] = df["county_name"].str.replace(" ", "")
df_acs['state_county_name'] = df_acs["state_county_name"].str.lower()
df_acs["county_name"] = df_acs["state_county_name"].str.replace(", indiana", "")
df_acs["county_name"] = df_acs["county_name"].str.replace(" county", "")
df_acs["county_name"] = df_acs["county_name"].str.replace(" ", "")
#np.isin(df["county_name"].unique(), df_acs["county_name"].unique())

#4.4) match other key variables
df = df.assign(
    race = 
        np.where(df['iv_race_other'] == 1, "other",
        np.where(df['iv_race_black'] == 1, "black",
        np.where(df['iv_race_white'] == 1, "white",
        np.where(df['iv_race_latino'] == 1, "latino", "")))),
    age_group = 
        np.where(df['cv_age'] < 15, "0-14", 
        np.where(df['cv_age'] < 15, "0-14", 
        np.where(df['cv_age'] < 25, "15-24", 
        np.where(df['cv_age'] < 35, "25-34", 
        np.where(df['cv_age'] < 45, "35-44", 
        np.where(df['cv_age'] < 55, "45-54", 
        np.where(df['cv_age'] < 65, "55-64", "65P"))))))),
    sex = np.where(df['cv_sex_female']== 1, 'female', 'male')
)

#4.5) rename variables and create state-level weights
df_acs = df_acs.rename(columns = {'age.group':'age_group'})
df_acs = df_acs.rename(columns = {'weight':'acs_group_prop'})

#4.6) merge weights and fips codes into df
del(df_acs['state_county_name'])
df = pd.merge(df, df_acs,  how='left', on = ['county_name', 'race', 'sex', 'age_group'])
#df['weight'] = df['weight'] / df['weight'].mean()

#4.7) determine extent to which each observation should be re-weighted
df['group_pop'] = df.groupby(['county_name', 'race', 'sex', 'age_group'])['race'].transform('count')
df['county_pop'] = df.groupby(['county_name'])['county_name'].transform('count')
df['group_prop'] = df['group_pop'] / df['county_pop']
df['weight_county'] = df['acs_group_prop'] / df['group_prop']
df['weight_county'] = df['weight_county'] / df['weight_county'].mean()
df = df.drop(['group_prop', 'acs_group_prop', 'county_pop', "group_pop", 'iv_race_other1', 'iv_race_other2', 'iv_race_other3', 'iv_race_other4'], axis = 1)

#4.8) merge in state weights
df_acs = pd.read_csv("C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/Other/acs_state_sample_weights.csv")
df_acs = df_acs[df_acs['state'] == 18]
df_acs = df_acs.rename(columns = {'age.group':'age_group'})
df_acs = df_acs.rename(columns = {'weight':'acs_group_prop'})
df_acs = df_acs.drop('state', axis = 1)
df = pd.merge(df, df_acs,  how='left', on = ['race', 'sex', 'age_group'])

#4.9) determine extent to which each observation should be re-weighted
df['group_prop'] = df.groupby(['race', 'sex', 'age_group'])['race'].transform('count') / df['id'].count()
df['weight_state'] = df['acs_group_prop'] / df['group_prop']
df['weight_state'] = df['weight_state'] / df['weight_state'].mean()
df = df.drop(['group_prop', 'acs_group_prop'], axis = 1)


# =============================================================================
#5.0) merge in acs variables
#5.1) read data
df_acs = pd.read_csv("C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/Other/acs_sample.csv")

#5.2) rename all columns
df_acs.columns = df_acs.columns.str.replace(".", "_")

#5.3) remove states outside indiana
df_acs = df_acs[df_acs['state_county'].astype(str).str.startswith("18")]
df_acs = df_acs.drop(['year', 'state'], axis = 1)

#5.4) merge into df
df = pd.merge(df, df_acs,  how='left', on = ['state_county'])


# =============================================================================

# crosswalk = pd.read_stata("C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/P2P-COVID/county.dta")
# #crosswalk.COUNTY = crosswalk.COUNTY.str.lower()
# #(df.county_name == crosswalk.COUNTY)
# df["id"] = crosswalk["SU_ID"]


#6) pull in relevant p2p vars
#6.1) identify files to read (excluding ones with irregular structures)
p2p_dir = 'C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/P2P-2020-05'
f_names = listdir(p2p_dir)
f_names  = list(np.array(f_names)[~np.isin(f_names, ['egocentric_networks.dta', 'field_interviewer_comments.dta', 'egocentric_networks_pairs.dta'])])
f_paths = (p2p_dir + '/' + pd.Series(f_names)).tolist()


#6.2) read all files and merge
l_p2p = list(pd.read_stata(f_paths[i]) for i in range(0, len(f_paths)))
for tdf in l_p2p:
    tdf[['SU_ID']] = tdf[['SU_ID']].astype(int).astype(str)
p2p = reduce(lambda df1, df2: pd.merge(df1, df2, how='outer', on='SU_ID'), l_p2p)


#6.3) get variable labels
p2p_dir = 'C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/P2P-2020-05'
f_names = listdir(p2p_dir)
f_paths = (p2p_dir + '/' + pd.Series(f_names)).tolist()
labels_p2p = list(pd.read_stata(f_paths[i], iterator = True) for i in range(0, len(f_paths)))
labels = {}
for i in range(0, len(labels_p2p)):
    labels.update(labels_p2p[i].variable_labels())
labels = pd.DataFrame(labels.items())
labels.to_csv('/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/p2p_labels.csv', index=False)


#6.4) create variables of interest
p2p = p2p.assign(
    id = p2p['SU_ID'].astype(int).astype(str),
    pre_emp_full_time = np.where(p2p[['EMP_STATUS']] == "Working full time", 1, 0),
    pre_emp_part_time = np.where(p2p[['EMP_STATUS']] == "Working part time", 1, 0),
    pre_unemployed = np.where(p2p[['EMP_STATUS']].isin(["Unemployed looking for work", "Unemployed not looking for work"]), 1, 0),
    pre_self_employed = np.where(p2p[['SELF_EMPLOYED']] == "SELF-EMPLOYED", 1, 0),
    pre_food_insecure = np.where(p2p[['NO_FOOD']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, p2p[['NO_FOOD']]),
	pre_home_insecure = np.where(p2p[['NO_SHELTER']].isin(["REFUSED", "DON'T KNOW"]), np.NaN, p2p[['NO_SHELTER']]),
    pre_child_number = np.where(p2p[['CHILDREN_NO']] == "NOT ASKED DUE TO SKIP LOGIC", 0, p2p[['CHILDREN_NO']]),
    pre_occ_farming = np.where(p2p[['FARMING']] == "SELECTED", 1, 0),
    pre_occ_automotive = np.where(p2p[['AUTOMOTIVE']] == "SELECTED", 1, 0),
    pre_occ_construction = np.where(p2p[['CONSTRUCTION']] == "SELECTED", 1, 0),
    pre_occ_manufacturing = np.where(p2p[['MANUFACTURING']] == "SELECTED", 1, 0),
    pre_occ_railroad = np.where(p2p[['RAILROAD']] == "SELECTED", 1, 0),
    pre_occ_forestry = np.where(p2p[['FORESTRY']] == "SELECTED", 1, 0),
    pre_occ_electrical = np.where(p2p[['ELECTRICAL']] == "SELECTED", 1, 0),
    pre_occ_beauty = np.where(p2p[['BEAUTY']] == "SELECTED", 1, 0),
    pre_occ_fire = np.where(p2p[['FIRE']] == "SELECTED", 1, 0),
    pre_occ_other1 = p2p['OTHERINDUSTRY1'],
    pre_occ_other2 = p2p['OTHERINDUSTRY2'],
    pre_smoker = p2p['CURRENT_SMOKE'],
    pre_diabetes = p2p['FH_DIABETES_SELF']
)

#6.5) drop other variables
p2p = p2p[['id', 'pre_emp_full_time', 'pre_emp_part_time', 'pre_self_employed', 'pre_unemployed', 'pre_food_insecure', 'pre_home_insecure', 'pre_child_number', 'pre_occ_farming', 'pre_occ_automotive', 'pre_occ_construction', 'pre_occ_manufacturing', 'pre_occ_railroad', 'pre_occ_forestry', 'pre_occ_electrical', 'pre_occ_beauty', 'pre_occ_fire', 'pre_smoker', 'pre_diabetes']]


#6.6) merge into df
df = pd.merge(df, p2p,  how='left', on = 'id')

#Still missing 400 observations...?
#df.id.isin(p2p.id).value_counts()
#sorted(p2p.id.astype(int))[0:5]
#sorted(df.id.astype(int))[0:5]
#sorted(df.id[~df.id.isin(p2p.id)])[0:5]
#sorted(p2p.id[~p2p.id.isin(df.id)])[0:5]

#7) save
df.to_csv('/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/p2p_covid.csv', index=False)


# =============================================================================
# df.weight_county.describe()
# df.weight_state.describe()
# np.corrcoef(df.weight_county.values, df.weight_state.values)
# 
# 
# df[df.weight_county == min(df.weight_county)]
# sorted(df.weight.unique())[0:10]
# df.weight.quantile([.02])
# 
# df.cv_age.describe()
# df_acs['age.group'].unique()
# df["state_county_name"].value_counts()
# 
# df.groupby(['county_name', 'race', 'sex', 'age_group']).size()
# a = p2p.SU_ID.astype("int")\
#   .isin(df.SU_ID.astype("int"))
# p2p[a].END_MATERIAL_FINISHTIME.quantile()
# p2p[a].END_MATERIAL_FINISHTIME.describe()

# =============================================================================



#8) Play with network data
#df_net = pd.read_stata('C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/P2P-2020-05/egocentric_networks.dta')
#df_adj_list = pd.read_stata('C:/Users/admin/Desktop/IDrive-Sync/Analysis - Code/Postdoc/COVID/Data/P2P-2020-05/egocentric_networks_pairs.dta')
